The objective of this notebook is to create a Seurat object, assess the quality of the data and filter out the low-quality cells. We followed the procedure described by a specific Signac’s vignette entitled “Joint RNA and ATAC analysis: 10x multiomic”.
library(Signac)
library(Seurat)
library(ggplot2)
library(ggpubr)
library(tidyverse)
library(plyr)
library(reshape2)
library(data.table)
library(GenomicRanges)
library(ensembldb)
library(EnsDb.Hsapiens.v86)
library(hdf5r)
library(stringr)
set.seed(173)
# Thresholds
TSS_enrichment <- 2
nucleosome_signal_atac <- 2
min_lib_size_atac<- 500
max_lib_size_atac <- 100000
min_lib_size_rna <- 550
max_lib_size_rna <- 40000
min_n_genes <- 250
max_n_genes <- 6000
max_pct_mt <- 20
# Paths
path_to_data_exp1 <- here::here("multiome/results/Experiment_1/")
path_to_data_exp2 <- here::here("multiome/results/Experiment_2/")
path_to_save <- here::here("multiome/results/R_objects/")
plot_histogram_qc <- function(df, x, x_lab) {
df %>%
ggplot(aes_string(x)) +
geom_histogram(bins = 100) +
labs(x = x_lab, y = "Number of Cells") +
theme_pubr()
}
Extraction of gene annotations from EnsDb using hg38 as the reference assembly.
annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
seqlevelsStyle(annotation) <- "UCSC"
genome(annotation) <- "hg38"
Signac uses information from three related input files (created using CellRanger ARC):
read_atac_data <- function(url_project,library_name) {
counts <- Seurat::Read10X_h5(filename = paste0(url_project, "filtered_feature_bc_matrix.h5"))
# create a Seurat object containing the scRNA adata
tonsil <- Seurat::CreateSeuratObject(
counts = counts$`Gene Expression`,
assay = "RNA")
# create a Seurat object containing the scATAC adata
tonsil[["ATAC"]] <- Signac::CreateChromatinAssay(
counts = counts$Peaks,
sep = c(":", "-"),
genome = "hg38",
fragments = paste0(url_project, "atac_fragments.tsv.gz"),
annotation = annotation)
tonsil$library_name = library_name
return(tonsil)
}
BCLL_9_T_1 <- read_atac_data(paste0(path_to_data_exp1,"co7dzuup_xuczw9vc/"),"BCLL_9_T_1")
BCLL_9_T_2 <- read_atac_data(paste0(path_to_data_exp1,"qmzb59ew_t11l8dzm/"),"BCLL_9_T_2")
BCLL_8_T_1 <- read_atac_data(paste0(path_to_data_exp1,"ulx1v6sz_8a2nvf1c/"),"BCLL_8_T_1")
BCLL_8_T_2 <- read_atac_data(paste0(path_to_data_exp1,"wdp0p728_jf6w68km/"),"BCLL_8_T_2")
BCLL_14_T_1 <- read_atac_data(paste0(path_to_data_exp2,"pd9avu0k_kf9ft6kk/"),"BCLL_14_T_1")
BCLL_14_T_2 <- read_atac_data(paste0(path_to_data_exp2,"vuuqir4h_wfkyb5v8/"),"BCLL_14_T_2")
BCLL_15_T_1 <- read_atac_data(paste0(path_to_data_exp2,"admae8w2_89i88tvv/"),"BCLL_15_T_1")
BCLL_15_T_2 <- read_atac_data(paste0(path_to_data_exp2,"sr20954q_yiuuoxng/"),"BCLL_15_T_2")
BCLL_2_T_1 <- read_atac_data(paste0(path_to_data_exp2,"kmbfo1ab_ie02b4ny/"),"BCLL_2_T_1")
BCLL_2_T_2 <- read_atac_data(paste0(path_to_data_exp2,"ryh4el3i_biv0w7ca/"),"BCLL_2_T_2")
BCLL_2_T_3 <- read_atac_data(paste0(path_to_data_exp2,"bs2e7lr7_mdfwypvz/"),"BCLL_2_T_3")
libraries <- c(BCLL_9_T_1,BCLL_9_T_2,BCLL_8_T_1,BCLL_8_T_2,
BCLL_14_T_1,BCLL_14_T_2,BCLL_15_T_1,BCLL_15_T_2,
BCLL_2_T_1,BCLL_2_T_2,BCLL_2_T_3)
We can compute per-cell quality metrics taking in account three specifics from DNA accessibility such as TSS.enrichment, nucleosome signal and nCount_ATAC and nCount_RNA from gene expression data.
quality_control <- function(seurat_object) {
DefaultAssay(seurat_object) <- "ATAC"
seurat_object <- NucleosomeSignal(object = seurat_object)
seurat_object <- TSSEnrichment(object = seurat_object, fast = FALSE)
seurat_object$high.tss <- ifelse(seurat_object$TSS.enrichment > 2, "High", "Low")
DefaultAssay(seurat_object) <- "RNA"
seurat_object[["pct_mt"]] <- PercentageFeatureSet(seurat_object, pattern = "^MT-")
seurat_object[["pct_ribosomal"]] <- PercentageFeatureSet(seurat_object, pattern = "^RPS")
return(seurat_object)
}
all_data = lapply(libraries, quality_control)
metadata_QC = data.frame()
for (dat in all_data)
{
metadata_QC = rbind(metadata_QC,dat@meta.data)
}
saveRDS(all_data,paste0(path_to_save,"1.tonsil_multiome_QC.rds"))
np <- ggviolin(metadata_QC,
x = "library_name", fill = "gray70", x.text.angle = 45,
y = "nFeature_ATAC") + scale_y_log10()
np
print(paste("The median of total number of peaks is:", median(metadata_QC$nFeature_ATAC)))
## [1] "The median of total number of peaks is: 3331"
print("The summary of total number of peaks per library")
## [1] "The summary of total number of peaks per library"
aggregate(nFeature_ATAC ~ library_name, data = metadata_QC, median)
## library_name nFeature_ATAC
## 1 BCLL_14_T_1 5009.0
## 2 BCLL_14_T_2 5001.0
## 3 BCLL_15_T_1 4264.0
## 4 BCLL_15_T_2 4160.0
## 5 BCLL_2_T_1 1611.0
## 6 BCLL_2_T_2 1598.5
## 7 BCLL_2_T_3 2223.0
## 8 BCLL_8_T_1 4656.0
## 9 BCLL_8_T_2 4355.0
## 10 BCLL_9_T_1 5552.0
## 11 BCLL_9_T_2 5461.5
Quantification of the ratio of mononucleosomal (fragment lengths between 147 and 294bp) to nucleosome-free fragments(less than 147).The red dashed line represent the upper threshold applied.
nbp <- function(seurat_object) {
DefaultAssay(seurat_object) <- "ATAC"
f <- FragmentHistogram(object = seurat_object)
NFR <- length(which(f$data$length < 147)) / nrow(f$data) * 100
MONO <- length(which(f$data$length > 147 & f$data$length < 294)) / nrow(f$data) * 100
DI <- length(which(f$data$length > 294)) / nrow(f$data) * 100
min.threshold <- 147
max.threshold <- 294
# options(repr.plot.width=7, repr.plot.height=2)
options(repr.plot.width = 17, repr.plot.height = 8)
p <- ggplot(f$data, aes(length)) + ggtitle(unique(seurat_object$library_name)) +
geom_histogram(binwidth = 1, alpha = .5, fill = "blue") +
geom_density(aes(y = ..count..), bw = 1, alpha = 0, col = "black", lwd = 1) +
scale_x_continuous(limits = c(0, 500)) +
geom_vline(xintercept = c(min.threshold, max.threshold)) +
theme_minimal() +
geom_vline(xintercept = 39, color = "red") +
geom_text(x = 80, y = 20, label = round(NFR, 2), size = 8) +
geom_text(x = 200, y = 20, label = round(MONO, 2), size = 8) +
geom_text(x = 350, y = 20, label = round(DI, 2), size = 8)
print(p)
}
plot_nbp = lapply(all_data, nbp)
plot_nbp
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
ns <- ggviolin(metadata_QC,
x = "library_name", fill = "gray70", x.text.angle = 45,
y = "nucleosome_signal") + scale_y_log10() + geom_hline(yintercept = nucleosome_signal_atac, linetype='dashed', col = 'black')
ns
Here, we can see the plot of the normalized TSS enrichment score at each position relative to the TSS. The red dashed line represent the upper threshold applied.
plot_tss <- function(seurat_object){
DefaultAssay(seurat_object) <- "ATAC"
Signac::TSSPlot(seurat_object, group.by = "high.tss") +
ggtitle(paste("TSS enrichment score", unique(seurat_object$library_name)))
}
plots_tss_tonsil = lapply(all_data, plot_tss)
plots_tss_tonsil
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
tss <- ggviolin(metadata_QC,
x = "library_name", fill = "gray70", x.text.angle = 45,
y = "TSS.enrichment") + geom_hline(yintercept = (2), linetype='dashed', col = 'black') + scale_y_log10()
tss
The red dashed lines represent the lower and the upper thresholds applied.
ns <- ggviolin(metadata_QC,
x = "library_name", fill = "gray70", x.text.angle = 45,
y = "nCount_ATAC") + scale_y_log10() +
geom_hline(yintercept = c(min_lib_size_atac,max_lib_size_atac), linetype='dashed', col = 'black')
ns
lib_size_hist <- metadata_QC %>%
plot_histogram_qc(x = "nCount_ATAC", x_lab = "Library Size (log10)") +
geom_vline(xintercept = min_lib_size_atac, linetype = "dashed", color = "black")
lib_size_hist1 <- lib_size_hist +
scale_x_log10() +
geom_vline(xintercept = max_lib_size_atac, linetype = "dashed", color = "black")
lib_size_hist2 <- lib_size_hist +
scale_x_continuous(limits = c(0, 5000)) +
xlab("Library Size") +
theme_pubr()
lib_size_hist1 + lib_size_hist2
We aim to detect and exclude empty droplets or lysed cells. Lysed cells have 3 hallmarks: (1) low library size (total UMI), (2) low library complexity (number of detected genes) and (3) high fraction of mitochondrial expression (cytosolic mRNA leaks out of the cell). Let us start by visualizing their univariate distributions.
The red dashed lines represent the lower and the upper thresholds applied.
nc <- ggviolin(metadata_QC,
x = "library_name", fill = "library_name", x.text.angle = 45,
y = "nCount_RNA") + scale_y_log10() +
geom_hline(yintercept = c(min_lib_size_rna,max_lib_size_rna), linetype='dashed', col = 'black')
nc
lib_size_hist <- metadata_QC %>%
plot_histogram_qc(x = "nCount_RNA", x_lab = "Library Size (log10(total UMI))") +
geom_vline(xintercept = min_lib_size_rna, linetype = "dashed", color = "black")
lib_size_hist1 <- lib_size_hist +
scale_x_log10() +
geom_vline(xintercept = max_lib_size_rna, linetype = "dashed", color = "black")
lib_size_hist2 <- lib_size_hist +
scale_x_continuous(limits = c(0, 2000)) +
xlab("Library Size (total UMI)") +
theme_pubr()
lib_size_hist1 + lib_size_hist2
The red dashed lines represent the lower and upper thresholds applied. However, cells with a value higher than the upper threshold will be kept for the downstream analysis.
metadata_QC$has_high_lib_size <-
metadata_QC$nCount_RNA > max_lib_size_rna |
metadata_QC$nFeature_RNA > max_n_genes
ns <- ggviolin(metadata_QC,
x = "library_name", fill = "gray70", x.text.angle = 45,
y = "nFeature_RNA") + scale_y_log10() +
geom_hline(yintercept = c(min_n_genes,max_n_genes), linetype='dashed', col = 'black')
ns
n_genes_hist1 <- metadata_QC %>%
plot_histogram_qc(x = "nFeature_RNA", x_lab = "Number of Detected Genes") +
geom_vline(xintercept = min_n_genes, linetype = "dashed", color = "black") +
geom_vline(xintercept = max_n_genes, linetype = "dashed", color = "black")
n_genes_hist2 <- n_genes_hist1 +
scale_x_continuous(limits = c(0, 2000))
n_genes_hist1 + n_genes_hist2
print(paste("The median of total number of genes is:", median(metadata_QC$nFeature_RNA)))
## [1] "The median of total number of genes is: 1764"
print("The summary of total number of genes per library")
## [1] "The summary of total number of genes per library"
aggregate(nFeature_RNA ~ library_name, data = metadata_QC, median)
## library_name nFeature_RNA
## 1 BCLL_14_T_1 1736
## 2 BCLL_14_T_2 1659
## 3 BCLL_15_T_1 1713
## 4 BCLL_15_T_2 1895
## 5 BCLL_2_T_1 1291
## 6 BCLL_2_T_2 1584
## 7 BCLL_2_T_3 1586
## 8 BCLL_8_T_1 2318
## 9 BCLL_8_T_2 2117
## 10 BCLL_9_T_1 1997
## 11 BCLL_9_T_2 2142
ggscatter(metadata_QC, x = "nCount_RNA", y = "nFeature_RNA",
color = "library_name")
pct_mt_hist <- metadata_QC %>%
plot_histogram_qc(x = "pct_mt", x_lab = "% Mitochondrial Expression") +
geom_vline(xintercept = max_pct_mt, linetype = "dashed", color = "black") +
scale_x_continuous(limits = c(0, 100))
pct_mt_hist
Let us also calculate and visualize the proportion of ribosomal expression:
metadata_QC %>% plot_histogram_qc(x = "pct_ribosomal", x_lab = "% Ribosomal Expression")
Finally we remove cells that are outliers for these QC metrics.
filtering_QC <- function(seurat_object) {
seurat_object <- subset(
x = seurat_object,
nCount_ATAC < max_lib_size_atac &
pct_mt < max_pct_mt &
nCount_ATAC > min_lib_size_atac &
nCount_RNA > min_lib_size_rna &
nFeature_RNA > min_n_genes &
nucleosome_signal < nucleosome_signal_atac &
TSS.enrichment > TSS_enrichment)
return(seurat_object)
}
tonsil_data_filtered = lapply(all_data, filtering_QC)
saveRDS(tonsil_data_filtered,paste0(path_to_save,"2.tonsil_multiome_filtered.rds"))
print(paste("Number of total filtered cells:", sum(melt(lapply(tonsil_data_filtered, ncol))$value)))
## [1] "Number of total filtered cells: 69118"
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Tonsil_atlas/lib/libopenblasp-r0.3.10.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] hdf5r_1.3.3 EnsDb.Hsapiens.v86_2.99.0 ensembldb_2.12.1 AnnotationFilter_1.12.0 GenomicFeatures_1.40.1 AnnotationDbi_1.50.3 Biobase_2.48.0 GenomicRanges_1.40.0 GenomeInfoDb_1.24.0 IRanges_2.22.1 S4Vectors_0.26.0 BiocGenerics_0.34.0 data.table_1.13.2 reshape2_1.4.4 plyr_1.8.6 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4 readr_1.4.0 tidyr_1.1.2 tibble_3.0.4 tidyverse_1.3.0 ggpubr_0.4.0 ggplot2_3.3.2 Seurat_3.9.9.9010 Signac_1.1.0.9000 BiocStyle_2.16.1
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.1 SnowballC_0.7.0 rtracklayer_1.48.0 GGally_2.0.0 bit64_4.0.5 knitr_1.30 irlba_2.3.3 DelayedArray_0.14.0 rpart_4.1-15 RCurl_1.98-1.2 generics_0.1.0 cowplot_1.1.0 RSQLite_2.2.1 RANN_2.6.1 future_1.20.1 bit_4.0.4 spatstat.data_1.4-3 xml2_1.3.2 lubridate_1.7.9 httpuv_1.5.4 SummarizedExperiment_1.18.1 assertthat_0.2.1 xfun_0.18 hms_0.5.3 evaluate_0.14 promises_1.1.1 fansi_0.4.1 progress_1.2.2 dbplyr_1.4.4 readxl_1.3.1 igraph_1.2.6 DBI_1.1.0 htmlwidgets_1.5.2 reshape_0.8.8 ellipsis_0.3.1 backports_1.2.0 bookdown_0.21 biomaRt_2.44.4 deldir_0.2-3 vctrs_0.3.4 here_1.0.1 ROCR_1.0-11
## [43] abind_1.4-5 withr_2.3.0 ggforce_0.3.2 BSgenome_1.56.0 checkmate_2.0.0 sctransform_0.3.1 GenomicAlignments_1.24.0 prettyunits_1.1.1 goftest_1.2-2 cluster_2.1.0 lazyeval_0.2.2 crayon_1.3.4 labeling_0.4.2 pkgconfig_2.0.3 tweenr_1.0.1 nlme_3.1-150 ProtGenerics_1.20.0 nnet_7.3-14 rlang_0.4.8 globals_0.13.1 lifecycle_0.2.0 miniUI_0.1.1.1 BiocFileCache_1.12.1 modelr_0.1.8 rsvd_1.0.3 dichromat_2.0-0 rprojroot_2.0.2 cellranger_1.1.0 polyclip_1.10-0 matrixStats_0.57.0 lmtest_0.9-38 graph_1.66.0 Matrix_1.2-18 ggseqlogo_0.1 carData_3.0-4 zoo_1.8-8 reprex_0.3.0 base64enc_0.1-3 ggridges_0.5.2 png_0.1-7 viridisLite_0.3.0 bitops_1.0-6
## [85] KernSmooth_2.23-17 Biostrings_2.56.0 blob_1.2.1 parallelly_1.21.0 jpeg_0.1-8.1 rstatix_0.6.0 ggsignif_0.6.0 scales_1.1.1 memoise_1.1.0 magrittr_1.5 ica_1.0-2 zlibbioc_1.34.0 compiler_4.0.3 RColorBrewer_1.1-2 fitdistrplus_1.1-1 Rsamtools_2.4.0 cli_2.1.0 XVector_0.28.0 listenv_0.8.0 patchwork_1.1.0 pbapply_1.4-3 htmlTable_2.1.0 Formula_1.2-4 MASS_7.3-53 mgcv_1.8-33 tidyselect_1.1.0 stringi_1.5.3 yaml_2.2.1 askpass_1.1 latticeExtra_0.6-29 ggrepel_0.8.2 grid_4.0.3 VariantAnnotation_1.34.0 fastmatch_1.1-0 tools_4.0.3 future.apply_1.6.0 rio_0.5.16 rstudioapi_0.12 foreign_0.8-80 lsa_0.73.2 gridExtra_2.3 farver_2.0.3
## [127] Rtsne_0.15 digest_0.6.27 BiocManager_1.30.10 shiny_1.5.0 Rcpp_1.0.5 car_3.0-10 broom_0.7.2 later_1.1.0.1 RcppAnnoy_0.0.16 OrganismDbi_1.30.0 httr_1.4.2 ggbio_1.36.0 biovizBase_1.36.0 colorspace_2.0-0 rvest_0.3.6 XML_3.99-0.3 fs_1.5.0 tensor_1.5 reticulate_1.18 splines_4.0.3 uwot_0.1.8.9001 RBGL_1.64.0 RcppRoll_0.3.0 spatstat.utils_1.17-0 plotly_4.9.2.1 xtable_1.8-4 jsonlite_1.7.1 spatstat_1.64-1 R6_2.5.0 Hmisc_4.4-1 pillar_1.4.6 htmltools_0.5.0 mime_0.9 glue_1.4.2 fastmap_1.0.1 BiocParallel_1.22.0 codetools_0.2-17 lattice_0.20-41 curl_4.3 leiden_0.3.5 zip_2.1.1 openxlsx_4.2.3
## [169] openssl_1.4.3 survival_3.2-7 rmarkdown_2.5 munsell_0.5.0 GenomeInfoDbData_1.2.3 haven_2.3.1 gtable_0.3.0